The goal

Demonstrate data acquisition, data preparation, data analysis and visualization.

(For required files: https://github.com/RamBarankin/Solar_Market.git).

Data types

The exercise

Find the best place to invest in current patterns according to local climate, current electricity prices, and attitude toward climate change.

1. Load general packages

library(XML)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

2. Find lat/long of cities (see GIS shapefile in the folder)

library(raster)
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
cities <- tbl_df(shapefile("Solar_Market/US_cities/US_cities.shp")) 
cities <- cities %>%  plyr::rename(c("ycoord"="LAT")) %>% plyr::rename(c("xcoord"="LONG")) %>% 
  filter(CLASS=="city") %>% dplyr::select(NAME, ST, LAT, LONG)

This is the resulted dataframe:

## # A tibble: 2,561 × 4
##                NAME    ST      LAT      LONG
##               <chr> <chr>    <dbl>     <dbl>
## 1     Bullhead City    AZ 35.12057 -114.5461
## 2  Lake Havasu City    AZ 34.50291 -114.3136
## 3          San Luis    AZ 32.48106 -114.7237
## 4          Somerton    AZ 32.60101 -114.7000
## 5              Yuma    AZ 32.55333 -114.5155
## 6          Adelanto    CA 34.58686 -117.4412
## 7      Agoura Hills    CA 34.15363 -118.7601
## 8           Alameda    CA 37.77993 -122.2783
## 9            Albany    CA 37.88984 -122.3002
## 10         Alhambra    CA 34.08398 -118.1355
## # ... with 2,551 more rows

3. Get average electricity prices

library(RCurl)
## Loading required package: bitops
data <- getURL("https://www.eia.gov/electricity/monthly/epm_table_grapher.cfm?t=epmt_5_6_a", ssl.verifypeer = FALSE)
perftable <- readHTMLTable(data[[1]], stringsAsFactors = T)
elctprice <- perftable[[2]]
elctprice <- tbl_df(elctprice[,c(1,10)])
colnames(elctprice) <- c("State", "Price")

This is the resulted dataframe:

## # A tibble: 62 × 2
##              State  Price
##             <fctr> <fctr>
## 1      New England  15.85
## 2      Connecticut  16.73
## 3            Maine  13.08
## 4    Massachusetts  16.19
## 5    New Hampshire  16.00
## 6     Rhode Island  15.62
## 7          Vermont  14.35
## 8  Middle Atlantic  12.20
## 9       New Jersey  12.91
## 10        New York  13.97
## # ... with 52 more rows

4. Add State’s abbreviation (to have a common column for the two dataframes)

us <- readHTMLTable("http://www.softschools.com/social_studies/state_abbreviations/",  
                    header=T, which=1,stringsAsFactors=F, trim = T, skip.rows = 3)
colnames(us) <- us[3,]
us <- tbl_df(us[4:nrow(us),])
elctprice <- elctprice %>% mutate(State=toupper(State)) %>%
  left_join(us) %>% dplyr::select(Abbreviation, Price) %>% plyr::rename(c("Abbreviation"="ST")) %>% na.omit()
## Joining, by = "State"

This is the resulted dataframe:

## # A tibble: 50 × 2
##       ST  Price
##    <chr> <fctr>
## 1     CT  16.73
## 2     ME  13.08
## 3     MA  16.19
## 4     NH  16.00
## 5     RI  15.62
## 6     VT  14.35
## 7     NJ  12.91
## 8     NY  13.97
## 9     PA  10.19
## 10    IL   8.91
## # ... with 40 more rows

5. Merge all and produce the index

cities <- cities %>% left_join(elctprice)
## Joining, by = "ST"
cities$Price <- as.numeric(cities$Price)
cities <- cities %>% na.omit() %>% mutate(idx_SUN = scale(LAT), idx_SUN=idx_SUN+abs(min(idx_SUN)),
                            idx_PRICE = scale(Price), idx_PRICE=idx_PRICE+abs(min(idx_PRICE)),
                            Market_Potential_Index = idx_SUN+idx_PRICE)
Market_Potential <- quantile(cities$Market_Potential_Index)
for (i in 1:nrow(cities)){
if (cities$Market_Potential_Index[i]<=Market_Potential[2]){cities$Market_Potential[i]<-"High"} else{
  if(cities$Market_Potential_Index[i]>Market_Potential[2]&cities$Market_Potential_Index[i]<=Market_Potential[3]){ cities$Market_Potential[i]<-"Medium"} else{
    cities$Market_Potential[i]<-"Low"}}}
## Warning: Unknown column 'Market_Potential'
cities$Market_Potential_Index <- -cities$Market_Potential_Index+9

This is the resulted dataframe:

## # A tibble: 2,560 × 9
##                NAME    ST      LAT      LONG Price  idx_SUN idx_PRICE
##               <chr> <chr>    <dbl>     <dbl> <dbl>    <dbl>     <dbl>
## 1     Bullhead City    AZ 35.12057 -114.5461    49 2.023794 2.6829421
## 2  Lake Havasu City    AZ 34.50291 -114.3136    49 1.905407 2.6829421
## 3          San Luis    AZ 32.48106 -114.7237    49 1.517880 2.6829421
## 4          Somerton    AZ 32.60101 -114.7000    49 1.540871 2.6829421
## 5              Yuma    AZ 32.55333 -114.5155    49 1.531733 2.6829421
## 6          Adelanto    CA 34.58686 -117.4412    14 1.921498 0.7266302
## 7      Agoura Hills    CA 34.15363 -118.7601    14 1.838461 0.7266302
## 8           Alameda    CA 37.77993 -122.2783    14 2.533513 0.7266302
## 9            Albany    CA 37.88984 -122.3002    14 2.554579 0.7266302
## 10         Alhambra    CA 34.08398 -118.1355    14 1.825110 0.7266302
## # ... with 2,550 more rows, and 2 more variables:
## #   Market_Potential_Index <dbl>, Market_Potential <chr>

6. Map

df <- cities
df <- df%>% plyr::rename(c("Market_Potential_Index"="Solar Market Potential index"))%>%
  plyr::rename(c("Market_Potential"="Market Potential Level")) %>% 
  plyr::rename(c("Price"="Electricity Price (Cents per KW)"))
map <- shapefile("Solar_Market//US_cities/US_cities.shp")
map <- merge(map, df)

map1 <- map[,c(1,2, 54, 57,58)]
map1 <- map1[na.omit(map1),]
library(mapview)
## Warning: package 'mapview' was built under R version 3.3.2
## Loading required package: leaflet
m <- mapview(map1, legend=T, zcol="Solar Market Potential index")

The map:

7. Discover attitudes using twitter

library(twitteR)
# setup_twitter_oauth (use yourown credentials)
cities <- cities[order(-cities$Market_Potential_Index),]
poten_cities <- cities[1:10,] # The cities with highest potential
# prepare Twitter parameters:
poten_cities$LatLong <- paste0(poten_cities$LAT, ",", poten_cities$LONG,
                               ", ", " 2mi") #location
date = as.character(Sys.Date()) # Today's date
poten_cities$Twitnum <- 0 # the variable to be used to count tweats
for (i in 1:nrow(poten_cities)){
  tw = searchTwitter('#climatechange', n = 1e4, since = date, geocode=poten_cities$LatLong[i])
  poten_cities$Twitnum[i] <- length(tw)
  } #get and store the # of tweats
poten_cities$Potential <- scale(poten_cities$Twitnum) #standardize the # of tweats 
poten_cities <- poten_cities[order(-poten_cities$Potential),]#order cities according to tweets

The resulting dataframe:

## # A tibble: 10 × 12
##             NAME    ST      LAT      LONG Price   idx_SUN idx_PRICE
##           <fctr> <chr>    <dbl>     <dbl> <dbl>     <dbl>     <dbl>
## 1    South Miami    FL 25.70605 -80.29535     1 0.2193168         0
## 2   Coral Gables    FL 25.66430 -80.27318     1 0.2113136         0
## 3      Homestead    FL 25.46640 -80.44723     1 0.1733832         0
## 4   Florida City    FL 25.44182 -80.46852     1 0.1686711         0
## 5       Key West    FL 24.56181 -81.76403     1 0.0000000         0
## 6  Miami Springs    FL 25.81952 -80.28949     1 0.2410653         0
## 7    Miami Beach    FL 25.81535 -80.13088     1 0.2402661         0
## 8          Doral    FL 25.80897 -80.35417     1 0.2390423         0
## 9          Miami    FL 25.79654 -80.20840     1 0.2366603         0
## 10    Sweetwater    FL 25.76496 -80.37184     1 0.2306083         0
## # ... with 5 more variables: Market_Potential_Index <dbl>,
## #   Market_Potential <chr>, LatLong <chr>, Twitnum <dbl>, Potential <dbl>

8. Visualize

library(plotly)
## Warning: package 'plotly' was built under R version 3.3.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:raster':
## 
##     select
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
poten_cities$NAME <- factor(poten_cities$NAME, levels = poten_cities[["NAME"]])

p <- plot_ly(poten_cities, x=~poten_cities$NAME ,y=~poten_cities$Potential, type="bar", 
        marker = list(color = 'rgb(158,202,225)', range ("-1", "-0.5", "Mean", "0.5", "1"),
                      line = list(color = 'rgb(8,48,107)', width = 1.5))) %>%
  layout(title = "Solar Market Potential - Twitter Analysis",
         xaxis = list(title = ""),
         yaxis = list(title = "#climatechange Tweets Relative to the Mean"))

The graph: